# Load the raw data file, dropping columns 2-9 and 11-13 by position.
data <- read.csv("../data/filledDatabase.csv")[, -c(2:9, 11:13)]
# Clean and collapse the data (both helpers are defined outside this section).
data <- clean_data(data) %>% collapse_data()
# Keep copies of the compound, group category and space group columns
# before the identifier columns are removed from `data`.
compound <- data$Compound
group_cat <- data$GroupCat
space_group <- data$SpaceGroup
# Drop the identifier / non-predictor columns; `GroupCat` stays in `data`
# because it is used as the response further down.
data <- select(data, -c("Compound", "X", "Z", "SpaceGroup", "SpaceGroupNumber"))
# Projection onto the first 13 principal components (currently disabled):
# data_pca <- get_pc_space(data[,-1], k = 13) %>% scale() %>% data.frame()
# Split the row indices into 5 held-out folds for cross validation later.
# seq_len() yields an empty index for a zero-row data frame, whereas
# 1:nrow(data) would yield c(1, 0).
# NOTE(review): no seed is set before this random split, so the folds
# (and everything computed from them) change between runs - confirm intended.
folds <- caret::createFolds(
  seq_len(nrow(data)),
  k = 5,
  list = TRUE,
  returnTrain = FALSE
)
Multinomial Regression
library(glmnet)
# Predictor matrix: every column of `data` except the first one.
# NOTE(review): assumes `GroupCat` is the first column of `data` - confirm.
X <- as.matrix(data[, -1])
# Response as a one-column matrix holding the group category.
Y <- as.matrix(data$GroupCat)
Coefficient
Ridge
# Ridge-penalised multinomial fit (alpha = 0); lambda is chosen by 5-fold
# cross-validation on the multinomial deviance.
ridge_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 0,
  nfolds = 5,
  type.measure = "deviance"
)
# Plot the coefficients at the CV-selected `lambda.min`, keeping the feature
# name plus the three class columns and leaving out the intercept row.
get_coef(ridge_cv, tuning_parameter = ridge_cv$lambda.min) %>%
  select(feature, Cubic, Tilted, Others) %>%
  filter(feature != "(Intercept)") %>%
  plot_coef()

LASSO
# LASSO-penalised multinomial fit (alpha = 1); lambda is chosen by 5-fold
# cross-validation on the multinomial deviance.
lasso_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 1,
  nfolds = 5,
  type.measure = "deviance"
)
# Plot the coefficients at the CV-selected `lambda.min`, keeping the feature
# name plus the three class columns and leaving out the intercept row.
get_coef(lasso_cv, tuning_parameter = lasso_cv$lambda.min) %>%
  select(feature, Cubic, Tilted, Others) %>%
  filter(feature != "(Intercept)") %>%
  plot_coef()

Elastic Net
library(caret)
# Elastic net via caret: 5-fold cross-validation over a tuning grid of
# length 10 for the glmnet method, with `GroupCat` regressed on all other
# columns of `data`.
elastic_cv <- train(
  GroupCat ~ .,
  data = data,
  method = "glmnet",
  trControl = trainControl(method = "cv", number = 5),
  tuneLength = 10
)
# Plot the coefficients of the final model at the selected lambda, keeping
# the feature name plus the three class columns and leaving out the
# intercept row.
get_coef(
  elastic_cv$finalModel,
  tuning_parameter = elastic_cv$bestTune$lambda
) %>%
  select(feature, Cubic, Tilted, Others) %>%
  filter(feature != "(Intercept)") %>%
  plot_coef()

Accurate classification rate
Ridge
# Prediction results for the ridge model (alpha = 0) at the CV-selected lambda.
tb_ridge <- prediction_table(alpha = 0, lambda = ridge_cv$lambda.min)
# Print the `r` element; presumably the per-fold rates - verify in helper.
print_accurate_tb(tb_ridge$r)
| Fold1 | Fold2 | Fold3 | Fold4 | Fold5 | Mean |
|---|---|---|---|---|---|
| 0.8533333 | 0.9054054 | 0.8513514 | 0.8108108 | 0.7567568 | 0.8355315 |
# Render the ridge `t` table as counts.
highlight_tb_count(tb_ridge$t)
|  | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 158 | 5 | 28 |
| Others | 0 | 28 | 3 |
| Tilted | 21 | 4 | 124 |
| Total | 179 | 37 | 155 |
# Render the ridge `t` table as proportions.
highlight_tb_percent(tb_ridge$t)
|  | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 0.88 | 0.14 | 0.18 |
| Others | 0 | 0.76 | 0.02 |
| Tilted | 0.12 | 0.11 | 0.8 |
| Total | 100% | 100% | 100% |
LASSO
# Prediction results for the LASSO model (alpha = 1) at the CV-selected lambda.
tb_lasso <- prediction_table(alpha = 1, lambda = lasso_cv$lambda.min)
# Print the `r` element; presumably the per-fold rates - verify in helper.
print_accurate_tb(tb_lasso$r)
| Fold1 | Fold2 | Fold3 | Fold4 | Fold5 | Mean |
|---|---|---|---|---|---|
| 0.8666667 | 0.8918919 | 0.8648649 | 0.8513514 | 0.7972973 | 0.8544144 |
# Render the LASSO `t` table as counts.
highlight_tb_count(tb_lasso$t)
|  | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 159 | 5 | 24 |
| Others | 0 | 28 | 1 |
| Tilted | 20 | 4 | 130 |
| Total | 179 | 37 | 155 |
# Render the LASSO `t` table as proportions.
highlight_tb_percent(tb_lasso$t)
|  | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 0.89 | 0.14 | 0.15 |
| Others | 0 | 0.76 | 0.01 |
| Tilted | 0.11 | 0.11 | 0.84 |
| Total | 100% | 100% | 100% |
Elastic Net
# Prediction results for the elastic net at the alpha/lambda pair selected
# by caret. The tuning values are read by name rather than by column
# position, so the call does not depend on the column order of `bestTune`
# and matches the `bestTune$lambda` access used for the coefficient plot.
tb_elastic <- prediction_table(
  alpha = elastic_cv$bestTune$alpha,
  lambda = elastic_cv$bestTune$lambda
)
# Print the `r` element; presumably the per-fold rates - verify in helper.
print_accurate_tb(tb_elastic$r)
| Fold1 | Fold2 | Fold3 | Fold4 | Fold5 | Mean |
|---|---|---|---|---|---|
| 0.8933333 | 0.8378378 | 0.8378378 | 0.8243243 | 0.8243243 | 0.8435315 |
# Render the elastic net `t` table as counts.
highlight_tb_count(tb_elastic$t)
|  | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 155 | 3 | 24 |
| Others | 4 | 29 | 2 |
| Tilted | 20 | 5 | 129 |
| Total | 179 | 37 | 155 |
# Render the elastic net `t` table as proportions.
highlight_tb_percent(tb_elastic$t)
|  | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 0.87 | 0.08 | 0.15 |
| Others | 0.02 | 0.78 | 0.01 |
| Tilted | 0.11 | 0.14 | 0.83 |
| Total | 100% | 100% | 100% |